Link to Kaggle, where we found our data set (TODO: insert the actual URL).




OLS Analysis

# OLS model specification: a plain linear regression fit with the
# base-R `lm` engine in regression mode.
lm_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_engine('lm')

OLS Recipe and Workflow Setup

# Recipe: model median_house_value on all other columns in `housing`,
# centering and scaling every numeric predictor.
lm_rec <- recipe(median_house_value ~ ., data = housing) %>%
  step_normalize(all_numeric_predictors())

# Bundle the recipe and the OLS spec into one workflow.
lm_wf <- workflow() %>%
  add_recipe(lm_rec) %>%
  add_model(lm_spec)

# Workflow 1: all predictors.
# BUG FIX: step_*() functions operate on a recipe, not a workflow —
# piping them onto workflow() errors at runtime. The preprocessing
# steps are folded into the recipe instead.
housing_lm_wf_1 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ ., data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

# Workflow 2: median_income only.
# BUG FIX: step_*() cannot be piped onto a workflow (and add_formula()
# cannot carry recipe steps). Use a formula-based recipe instead.
housing_lm_wf_2 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ median_income, data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

# Workflow 3: median_income + longitude.
# BUG FIX: recipe steps moved into a recipe (piping step_*() onto a
# workflow errors; add_formula() cannot carry steps).
housing_lm_wf_3 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ median_income + longitude, data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

# Workflow 4: median_income + longitude + latitude.
# BUG FIX: recipe steps moved into a recipe (piping step_*() onto a
# workflow errors; add_formula() cannot carry steps).
housing_lm_wf_4 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ median_income + longitude + latitude,
           data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

# Workflow 5: adds total_rooms.
# BUG FIX: recipe steps moved into a recipe (piping step_*() onto a
# workflow errors; add_formula() cannot carry steps).
housing_lm_wf_5 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ median_income + longitude + latitude + total_rooms,
           data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

# Workflow 6: adds population.
# BUG FIX: recipe steps moved into a recipe (piping step_*() onto a
# workflow errors; add_formula() cannot carry steps).
housing_lm_wf_6 <- workflow() %>%
  add_recipe(
    recipe(median_house_value ~ median_income + longitude + latitude +
             total_rooms + population,
           data = housing) %>%
      step_normalize(all_numeric_predictors()) %>%
      step_dummy(all_nominal_predictors()) %>%
      step_nzv(all_predictors())
  ) %>%
  add_model(lm_spec)

OLS Fit and Tune Model

# Fit six nested OLS models on the full housing data (no resampling here):
# mod1 uses every predictor; mod2-mod6 build up one predictor at a time,
# starting from median_income.
mod1 <- fit(lm_spec,
            median_house_value ~ .,
            data = housing)
mod2 <- fit(lm_spec,
            median_house_value ~ median_income,
            data = housing)
mod3 <- fit(lm_spec,
            median_house_value ~ median_income + longitude,
            data = housing)
mod4 <- fit(lm_spec,
            median_house_value ~ median_income + longitude + latitude,
            data = housing)
mod5 <- fit(lm_spec,
            median_house_value ~ median_income + longitude + latitude + total_rooms,
            data = housing)
mod6 <- fit(lm_spec,
            median_house_value ~ median_income + longitude + latitude + total_rooms + population,
            data = housing)

# Coefficient plot for the full model: point estimates with approximate
# 95% CIs (estimate +/- 1.96 * SE). slice(-1) drops the intercept row so
# the slopes are readable on one scale.
mod1 %>% 
  tidy() %>% 
  slice(-1) %>% 
  mutate(lower = estimate - 1.96*std.error, upper = estimate + 1.96*std.error) %>%
  ggplot() + 
    geom_vline(xintercept=0, linetype=4) + 
    geom_point(aes(x=estimate, y=term)) + 
    geom_segment(aes(y=term, yend=term, x=lower, xend=upper), 
                 arrow = arrow(angle=90, ends='both', length = unit(0.1, 'cm'))) + 
    labs(x = 'Coefficient estimate (95% CI)', y = 'Feature') +
    theme_classic()

# Full coefficient table for the all-predictor OLS model.
mod1 %>%
  tidy()
## # A tibble: 13 × 5
##    term                  estimate std.error statistic   p.value
##    <chr>                    <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)        -2273908.   88456.       -25.7  1.88e-143
##  2 longitude            -26813.    1020.       -26.3  6.60e-150
##  3 latitude             -25482.    1005.       -25.4  9.39e-140
##  4 housing_median_age     1073.      43.9       24.4  4.85e-130
##  5 total_rooms              -6.19     0.791     -7.83 5.32e- 15
##  6 total_bedrooms          101.       6.87      14.6  2.73e- 48
##  7 population              -38.0      1.08     -35.3  9.35e-265
##  8 households               49.6      7.45       6.66 2.83e- 11
##  9 median_income         39260.     338.       116.   0        
## 10 ocean_proximity2       3954.    1913.         2.07 3.88e-  2
## 11 ocean_proximity3       8232.    2176.         3.78 1.55e-  4
## 12 ocean_proximity4     156856.   30778.         5.10 3.49e-  7
## 13 ocean_proximity5     -35330.    2336.       -15.1  2.23e- 51
# For each model: predict on the training data, bind the predictions back
# onto `housing`, and compute residuals (observed - predicted). These are
# in-sample residuals, used below for diagnostics and training MAE.
mod1_output <- mod1 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

mod2_output <- mod2 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

mod3_output <- mod3 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

mod4_output <- mod4 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

mod5_output <- mod5 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

mod6_output <- mod6 %>% 
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)
# Training-set MAE (in dollars) for each model. Note these are in-sample
# errors and will be optimistic relative to the cross-validated metrics below.
mod1_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      49748.
mod2_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      62598.
mod3_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      62486.
mod4_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      54807.
mod5_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      54780.
mod6_output %>%
    mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard      53892.

OLS Visualize Model

# Helper: residuals vs. a single predictor, with a loess smooth and a red
# zero reference line. `var` is an unquoted column of `output` (tidy-eval
# via {{ }}). Replaces 17 copy-pasted ggplot chunks; each call at the top
# level still auto-prints the plot exactly as before.
plot_resid <- function(output, var) {
  ggplot(output, aes(y = resid, x = {{ var }})) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic()
}

# Full model (mod1) residual diagnostics
plot_resid(mod1_output, median_house_value)

#not very useful?
plot_resid(mod1_output, housing_median_age)

#not very useful 
plot_resid(mod1_output, ocean_proximity)

plot_resid(mod1_output, longitude)

plot_resid(mod1_output, latitude)

plot_resid(mod1_output, total_rooms)

plot_resid(mod1_output, median_income)

plot_resid(mod1_output, total_bedrooms)

# Simple model (mod2) residuals against included and omitted predictors
plot_resid(mod2_output, total_rooms)

plot_resid(mod2_output, median_house_value)

plot_resid(mod2_output, longitude)

plot_resid(mod2_output, latitude)

plot_resid(mod2_output, median_income)

plot_resid(mod2_output, population)

plot_resid(mod3_output, population)

plot_resid(mod3_output, total_rooms)

plot_resid(mod6_output, longitude)

OLS Calculate and Collect Metrics

# 10-fold cross-validation for each of the six workflows, collecting
# RMSE, R-squared, and MAE on the held-out folds.
# NOTE(review): data_cv10 is not created until much later in this file
# (vfold_cv call near the spline section) — confirm the chunk order in the
# original Rmd puts its definition before this point.
mod1_cv <- fit_resamples(housing_lm_wf_1,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))
  
  
mod2_cv <- fit_resamples(housing_lm_wf_2,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))
  
  
mod3_cv <- fit_resamples(housing_lm_wf_3,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))
  
  
mod4_cv <- fit_resamples(housing_lm_wf_4,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))
  
  
mod5_cv <- fit_resamples(housing_lm_wf_5,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))

  
mod6_cv <- fit_resamples(housing_lm_wf_6,
  resamples = data_cv10, 
  metrics = metric_set(rmse, rsq, mae))
# Average cross-validated RMSE per model: unnest the per-fold metrics,
# keep rmse, and average across the 10 folds. (collect_metrics(), used
# below, reports the same means more directly.)
mod1_cv %>% unnest(.metrics) %>%
  filter(.metric == 'rmse') %>%
  summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  68711.
mod2_cv %>% unnest(.metrics) %>%
  filter(.metric == 'rmse') %>%
  summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  83701.
mod3_cv %>% unnest(.metrics) %>%
  filter(.metric == 'rmse') %>%
  summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  83610.
mod4_cv %>% unnest(.metrics) %>%
  filter(.metric == 'rmse') %>%
  summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  74421.
mod5_cv %>% unnest(.metrics) %>%
    filter(.metric == 'rmse') %>%
    summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  74386.
mod6_cv %>% unnest(.metrics) %>%
    filter(.metric == 'rmse') %>%
    summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
##   RMSE_CV
##     <dbl>
## 1  73070.
# Full CV metric summaries (mean, n folds, standard error) for all six
# models. Model 1 (all predictors) has the lowest CV RMSE/MAE here.
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   49798.       10 435.      Preprocessor1_Model1
## 2 rmse    standard   68711.       10 798.      Preprocessor1_Model1
## 3 rsq     standard       0.646    10   0.00588 Preprocessor1_Model1
mod2_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   62609.       10 532.      Preprocessor1_Model1
## 2 rmse    standard   83701.       10 888.      Preprocessor1_Model1
## 3 rsq     standard       0.474    10   0.00782 Preprocessor1_Model1
mod3_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   62503.       10 553.      Preprocessor1_Model1
## 2 rmse    standard   83610.       10 898.      Preprocessor1_Model1
## 3 rsq     standard       0.475    10   0.00798 Preprocessor1_Model1
mod4_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   54826.       10 478.      Preprocessor1_Model1
## 2 rmse    standard   74421.       10 791.      Preprocessor1_Model1
## 3 rsq     standard       0.584    10   0.00545 Preprocessor1_Model1
mod5_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   54802.       10 474.      Preprocessor1_Model1
## 2 rmse    standard   74386.       10 792.      Preprocessor1_Model1
## 3 rsq     standard       0.585    10   0.00552 Preprocessor1_Model1
mod6_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 mae     standard   53916.       10 442.      Preprocessor1_Model1
## 2 rmse    standard   73070.       10 799.      Preprocessor1_Model1
## 3 rsq     standard       0.599    10   0.00578 Preprocessor1_Model1




LASSO Analysis

LASSO Model spec, recipe, and workflow

# LASSO specification: penalized linear regression via glmnet.
lasso_spec <- 
  linear_reg() %>%
  set_args(mixture = 1, penalty = tune()) %>% ## mixture = 1 indicates Lasso, we'll choose penalty later
  set_engine(engine = 'glmnet') %>% #note we are using a different engine
  set_mode('regression') 
lasso_rec <- recipe(median_house_value ~ ., data = housing) %>%
    step_nzv(all_predictors()) %>% # drops near-zero-variance (nearly constant) predictors
    #step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables 
    step_normalize(all_numeric_predictors()) %>%  # important standardization step for LASSO
    step_dummy(all_nominal_predictors())  # creates indicator variables for categorical variables

# Combine the LASSO recipe and tunable spec into one workflow.
lasso_wf <- workflow() %>% 
  add_recipe(lasso_rec) %>%
  add_model(lasso_spec)

LASSO Fit and Tune Model

# Tune LASSO #1
penalty_grid <- grid_regular(
  penalty(range = c(-5,5)), #log10 transformed 10^-5 to 10^3
  levels = 30)

# Tune LASSO #2
tune_res <- tune_grid( # new function for tuning parameters
  lasso_wf, # workflow
  resamples = data_cv10, # cv folds
  metrics = metric_set(rmse, mae),
  grid = penalty_grid)

# LASSO model
lasso_fit <- lasso_wf %>% 
  fit(data = housing) # Fit to data

LASSO Calculate and Collect Metrics

# plotting LASSO lambda
plot(lasso_fit %>% 
       extract_fit_parsnip() %>% 
       pluck('fit'), # way to get the original glmnet output
     xvar = "lambda")

# Visualize LASSO Metrics from Tuning
autoplot(tune_res) + theme_classic()

# Summarize LASSO CV Metrics
collect_metrics(tune_res) %>%
  filter(.metric == 'mae') %>% # or choose mae
  select(penalty, mae = mean) 
## # A tibble: 30 × 2
##      penalty    mae
##        <dbl>  <dbl>
##  1 0.00001   49791.
##  2 0.0000221 49791.
##  3 0.0000489 49791.
##  4 0.000108  49791.
##  5 0.000240  49791.
##  6 0.000530  49791.
##  7 0.00117   49791.
##  8 0.00259   49791.
##  9 0.00574   49791.
## 10 0.0127    49791.
## # … with 20 more rows
# choose penalty value based on lowest mae or rmse
# choose penalty value based on lowest CV MAE
best_penalty <- select_best(tune_res, metric = 'mae') 
best_penalty
## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <chr>                
## 1    78.8 Preprocessor1_Model21
# choose penalty value based on the largest penalty within 1 se of the lowest CV MAE
best_se_penalty <- select_by_one_std_err(tune_res, metric = 'mae', desc(penalty)) 


# Fit Final LASSO Models (one per penalty-selection rule)
final_wf <- finalize_workflow(lasso_wf, best_penalty) # incorporates penalty value to workflow
final_wf_se <- finalize_workflow(lasso_wf, best_se_penalty) # incorporates penalty value to workflow


final_fit <- fit(final_wf, data = housing)
final_fit_se <- fit(final_wf_se, data = housing)

# Coefficients at the lowest-MAE penalty (78.8): nothing shrunk to zero.
tidy(final_fit)
## # A tibble: 13 × 3
##    term               estimate penalty
##    <chr>                 <dbl>   <dbl>
##  1 (Intercept)         216740.    78.8
##  2 longitude           -50809.    78.8
##  3 latitude            -51528.    78.8
##  4 housing_median_age   13453.    78.8
##  5 total_rooms         -11697.    78.8
##  6 total_bedrooms       40214.    78.8
##  7 population          -42467.    78.8
##  8 households           18813.    78.8
##  9 median_income        74149.    78.8
## 10 ocean_proximity_X2    2872.    78.8
## 11 ocean_proximity_X3    7510.    78.8
## 12 ocean_proximity_X4  152032.    78.8
## 13 ocean_proximity_X5  -38212.    78.8
# Coefficients at the 1-SE penalty (853): total_rooms and
# ocean_proximity_X2 are shrunk to exactly zero.
tidy(final_fit_se)
## # A tibble: 13 × 3
##    term               estimate penalty
##    <chr>                 <dbl>   <dbl>
##  1 (Intercept)         223724.    853.
##  2 longitude           -28950.    853.
##  3 latitude            -28678.    853.
##  4 housing_median_age   13219.    853.
##  5 total_rooms              0     853.
##  6 total_bedrooms       26949.    853.
##  7 population          -36168.    853.
##  8 households           14685.    853.
##  9 median_income        71425.    853.
## 10 ocean_proximity_X2       0     853.
## 11 ocean_proximity_X3    6385.    853.
## 12 ocean_proximity_X4  109045.    853.
## 13 ocean_proximity_X5  -55699.    853.
  • Use the estimates to talk about how changes in each predictor lead to changes in the response variable.

LASSO Visualize residuals

# Residuals for the 1-SE LASSO model: predictions on the training data
# with resid = observed - predicted.
lasso_mod_out <- final_fit_se %>%
    predict(new_data = housing) %>%
    bind_cols(housing) %>%
    mutate(resid = median_house_value - .pred)

# Residuals vs. fitted values with a loess smooth and zero line.
lasso_mod_out %>% 
  ggplot(aes(x = .pred, y = resid)) + 
  geom_point() +
  geom_smooth(se = FALSE) + 
  geom_hline(yintercept = 0) + 
  theme_classic()

  • There is a straight line because of the cap at $500,000 for housing value in this data set.

\(~\)

C.Compare estimated test performance across the models. Which models(s) might you prefer?

  • Based on the metrics calculated using ordinary least squares, and on the average cross-validated RMSE, model 1 performed the best. The first model, which includes all of the variables, is the best. This is also validated by the best lambda measure calculated using LASSO.

\(~\)

D. Use residual plots to evaluate whether some quantitative predictors might be better modeled with nonlinear relationships.

  • Latitude and longitude might be better modeled with a nonlinear relationship as there appears to be two spikes in their residual plots.

\(~\)

E. Which variables do you think are the most important predictors of your quantitative outcome? Justify your answer. Do the methods you’ve applied reach consensus on which variables are most important? What insights are expected? Surprising?

  • The most important predictor according to the LASSO is median income: it survives the longest as the penalty grows. Median income also has the lowest p-value. This is not surprising, as a higher income would suggest the neighboring households’ ability to purchase a more expensive home. By p-value, population has the second-lowest probability of its observed relationship being due to random chance. All that being said, the p-value is not always the best indicator of which variables to include in the model.




Summarize Investigations

Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?

  • The model that contains all of the variables in the data set is the best model so far. Based on the fact that that model yielded the lowest CV test error rate, it seems like taking into account of house age, number of rooms, location, ocean proximity, total bedrooms, population, and income level of that block of houses is important to determine house value.

  • The interpretability of our model is pretty straightforward, though we would like to find out whether some individual variables could have more influence on house value and make predictions with those variables.

\(~\) \(~\)

Societal Impact

Are there any harms that may come from your analyses and/or how the data were collected? What cautions do you want to keep in mind when communicating your work?

  • The variable median_income measures the median income for households within a block of houses. Since this variable seems to be the most persistent in our model, according to our LASSO results, this confirms the seriously large wage gap in the United States. Income affects one’s social status, health care access, and even house value. We could further infer from our preliminary analysis that income affects house value because it is more likely that those who have a higher income are not criminals. Where criminal activity is low, housing options there attracts more buyers — especially buyers of higher social status.

  • The data was derived from Aurélien Géron’s book ‘Hands-On Machine Learning with Scikit-Learn and TensorFlow’. It contains the 1990 California census information, specifically on housing. We don’t suspect that the way in which the data was collected is harmful. But we do need to acknowledge that those who did respond to census letters are probably those who have a mailing address and those who are in middle- to higher-income groups. Back then, census surveys were not delivered electronically, and therefore data collection must have missed people who did not have a mailing address or P.O. box. Additionally, people in middle- to higher-income groups would be more likely to respond to the census because they can read and write English, they might be more informed about the purpose of a census survey, and they might simply have more time to attend to a list of questions printed on multiple pages of paper. People who had lower incomes probably had to work many more hours just to make ends meet, and the census letter could be the last thing on their minds. And keep in mind that the data set is specifically on California housing — California being, arguably, one of the wealthier and more liberal states in the US.







Project Work Part 2

  1. Accounting for non-linearity
    • Update your models to use natural splines for some of the quantitative predictors to account for non-linearity (these are GAMs).
      • I recommend using OLS engine to fit these final models.
      • You’ll need to update the recipe to include step_ns() for each quantitative predictor that you want to allow to be non-linear.
      • To determine number of knots (deg_free), I recommend fitting a smoothing spline and use edf to inform your choice.
# Linear Regression Model Spec (re-declared; identical to the earlier lm_spec)
lm_spec <- 
  linear_reg() %>%
  set_engine(engine = 'lm') %>%
  set_mode('regression')

# Base recipe with no preprocessing; splines are layered on below.
lm_rec <- recipe(median_house_value ~ ., data = housing)

# Natural-spline (GAM-like) recipe: 10 df for each quantitative predictor.
# NOTE(review): median_income is left linear here — confirm intentional
# (ocean_proximity is categorical, so it cannot be splined).
ns_rec <- lm_rec %>%
  step_ns(longitude, deg_free = 10) %>% 
  step_ns(latitude, deg_free = 10) %>% 
  step_ns(housing_median_age, deg_free = 10) %>% 
  step_ns(total_rooms, deg_free = 10) %>% 
  step_ns(total_bedrooms, deg_free = 10) %>% 
  step_ns(households, deg_free = 10) %>% 
  step_ns(population, deg_free = 10)

#degrees of freedom has to be a whole number and larger than edf 
  
ns_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(ns_rec)
# 10-fold CV splits used throughout the document.
data_cv10<- vfold_cv(housing, v = 10)

cv_output <- fit_resamples(
  ns_wf,
  resamples = data_cv10, # cv folds
  metrics = metric_set(mae))

cv_output %>% collect_metrics()
## # A tibble: 1 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 mae     standard   44580.    10    222. Preprocessor1_Model1
# Fit with all data
# ns_mod <- fit(
#   lm_spec, #workflow
#   data = housing)

\(~\)

# GAM specification via the mgcv engine.
gam_spec <- 
  gen_additive_mod() %>%
  set_engine(engine = 'mgcv') %>%
  set_mode('regression') 

# Fit a GAM: smoothing splines s() for every quantitative predictor,
# ocean_proximity entering as a parametric (categorical) term.
gam_mod <- fit(gam_spec,
    median_house_value ~ ocean_proximity + s(longitude) + s(latitude) + 
      s(housing_median_age) + s(total_rooms) + s(total_bedrooms) + 
      s(population) + s(households) + s(median_income),
    data = housing)

\(~\)

# 2x2 panel layout for the gam.check diagnostic plots.
par(mfrow=c(2,2))

# Basis-dimension checks: edf near k' with a low p-value suggests the
# basis size k may be too small for that smooth.
gam_mod %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 14 iterations.
## The RMS GCV score gradient at convergence was 269.7354 .
## The Hessian was positive definite.
## Model rank =  77 / 77 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'  edf k-index p-value    
## s(longitude)          9.00 8.95    0.89  <2e-16 ***
## s(latitude)           9.00 8.90    0.90  <2e-16 ***
## s(housing_median_age) 9.00 8.86    1.01   0.700    
## s(total_rooms)        9.00 7.00    1.00   0.385    
## s(total_bedrooms)     9.00 9.00    1.01   0.875    
## s(population)         9.00 8.17    0.94  <2e-16 ***
## s(households)         9.00 4.95    1.01   0.690    
## s(median_income)      9.00 8.15    0.97   0.005 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full mgcv summary: parametric ocean_proximity coefficients plus the
# edf and significance of each smooth term.
gam_mod %>% pluck('fit') %>% summary() 
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## median_house_value ~ ocean_proximity + s(longitude) + s(latitude) + 
##     s(housing_median_age) + s(total_rooms) + s(total_bedrooms) + 
##     s(population) + s(households) + s(median_income)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        211974       1968 107.712  < 2e-16 ***
## ocean_proximity2    -3028       2270  -1.334 0.182359    
## ocean_proximity3    21981       2460   8.936  < 2e-16 ***
## ocean_proximity4   103433      27675   3.737 0.000186 ***
## ocean_proximity5   -20833       2664  -7.820 5.54e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df        F p-value    
## s(longitude)          8.950  8.999  130.752  <2e-16 ***
## s(latitude)           8.901  8.997  146.490  <2e-16 ***
## s(housing_median_age) 8.862  8.993   64.501  <2e-16 ***
## s(total_rooms)        7.004  8.188    8.413  <2e-16 ***
## s(total_bedrooms)     9.000  9.000   39.516  <2e-16 ***
## s(population)         8.174  8.754  333.367  <2e-16 ***
## s(households)         4.951  6.171   28.213  <2e-16 ***
## s(median_income)      8.151  8.804 1179.025  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.718   Deviance explained = 71.9%
## GCV = 3.7731e+09  Scale est. = 3.7604e+09  n = 20433

\(~\)

  1. Compare insights from variable importance analyses here and the corresponding results from the Investigation 1. Now after having accounted for non-linearity, have the most relevant predictors changed?

\(~\)

  1. Do you gain any insights from the GAM output plots (easily obtained from fitting smoothing splines) for each predictor?

  • The GAM output plots show us that total_rooms, total_bedrooms, households, and population become less accurate in predicting near the upper ends. Population seems to be the least accurate in predicting using smoothing splines.

gam_mod %>% pluck('fit') %>% plot(page = 1)

\(~\)

  1. Compare model performance between your GAM models and the models that assumes linearity.

\(~\)

  1. How does test performance of the GAMs compare to other models you explored?

Below are the MAEs yielded from our models thus far: 1) OLS: 49797.67, 2) LASSO: 49791.43,
3) GAMs: 44580.23.


  1. Summarize investigations

Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?


  1. Societal impact

Are there any harms that may come from your analyses and/or how the data were collected?

\(~\)

What cautions do you want to keep in mind when communicating your work?




Project Work for HW 4

  1. Specify the research question for a classification task.
  • Could the median income of the home owner and population of the community predict house value?

\(~\)

  1. Try to implement at least 2 different classification methods to answer your research question.
  • Decision tree and K-means clustering

\(~\)

  1. Reflect on the information gained from these two methods and how you might justify this method to others.

Keep in mind that the final project will require you to complete the pieces below. Use this as a guide for your work but don’t try to accomplish everything for HW4:

Classification - Methods - Indicate at least 2 different methods used to answer your classification research question. - Describe what you did to evaluate the models explored. - Indicate how you estimated quantitative evaluation metrics. - Describe the goals / purpose of the methods used in the overall context of your research investigations.

Classification - Results - Summarize your final model and justify your model choice (see below for ways to justify your choice). - Compare the different classification models tried in light of evaluation metrics, variable importance, and data context. - Display evaluation metrics for different models in a clean, organized way. This display should include both the estimated metric as well as its standard deviation. (This won’t be available from OOB error estimation. If using OOB, don’t worry about reporting the SD.) - Broadly summarize conclusions from looking at these evaluation metrics and their measures of uncertainty.

Classification - Conclusions - Interpret evaluation metric(s) for the final model in context. Does the model show an acceptable amount of error? - If using OOB error estimation, display the test (OOB) confusion matrix, and use it to interpret the strengths and weaknesses of the final model. - Summarization should show evidence of acknowledging the data context in thinking about the sensibility of these results.



K-Means Clustering on Housing Data

Set up K-Means by selecting variables that had the most influence based on previous data analysis (i.e., HW2)

# Two 2-variable feature sets for k-means, chosen from the predictors
# highlighted by the earlier analysis.
housing_popIncome <- housing %>% 
  select(population, median_income)

housing_bedroomAge <- housing %>% 
  select(housing_median_age, total_bedrooms)

# Fix the RNG so k-means cluster assignments are reproducible.
set.seed(620)

\(~\)

Create kclust variable for median income and population and graph it

# k = 3 clustering on population & median_income, once on raw values and
# once on scaled values (scaling matters: the two variables have very
# different ranges — see summary() below).
kclust_k3 <- kmeans(housing_popIncome, centers = 3)
kclust_k3_scale <- kmeans(scale(housing_popIncome), centers = 3)

# Attach both cluster assignments (as factors) to the full data.
housing_PI <- housing %>%
  mutate(kclust_3 = factor(kclust_k3$cluster)) %>% 
  mutate(kclust_3_scale = factor(kclust_k3_scale$cluster))

ggplot(housing_PI, aes(x = population, y = median_income, color = kclust_3)) +
  geom_point() +
  labs(title = "K-Means clustering on median income and population") +
  theme_classic()

ggplot(housing_PI, aes(x = population, y = median_income, color = kclust_3_scale)) +
  geom_point() +
  labs(title = "K-Means with scaled clustering on median income and population") +
  theme_classic()

summary(housing_popIncome)
##    population    median_income    
##  Min.   :    3   Min.   : 0.4999  
##  1st Qu.:  787   1st Qu.: 2.5637  
##  Median : 1166   Median : 3.5365  
##  Mean   : 1425   Mean   : 3.8712  
##  3rd Qu.: 1722   3rd Qu.: 4.7440  
##  Max.   :35682   Max.   :15.0001

\(~\)

Exploring K-Means clustering for other variables: total bedrooms and house age

# Same procedure for total_bedrooms & housing_median_age: raw and scaled
# k = 3 solutions, attached as factor columns.
kclust_k3_BA <- kmeans(housing_bedroomAge, centers = 3)
kclust_k3_scale_BA <- kmeans(scale(housing_bedroomAge), centers = 3)

housing_BA <- housing %>%
  mutate(kclust_3_BA = factor(kclust_k3_BA$cluster)) %>% 
  mutate(kclust_3_scale_BA = factor(kclust_k3_scale_BA$cluster))

ggplot(housing_BA, aes(x = total_bedrooms, y = housing_median_age, color = kclust_3_BA)) +
  geom_point() +
  labs(title = "K-Means clustering on number of bedrooms and house median age") +
  theme_classic()

ggplot(housing_BA, aes(x = total_bedrooms, y = housing_median_age, color = kclust_3_scale_BA)) +
  geom_point() +
  labs(title = "K-Means with scaled clustering on number of bedrooms and house median age") +
  theme_classic()

summary(housing_bedroomAge)
##  housing_median_age total_bedrooms  
##  Min.   : 1.00      Min.   :   1.0  
##  1st Qu.:18.00      1st Qu.: 296.0  
##  Median :29.00      Median : 435.0  
##  Mean   :28.63      Mean   : 537.9  
##  3rd Qu.:37.00      3rd Qu.: 647.0  
##  Max.   :52.00      Max.   :6445.0

\(~\)

Interpret K-Means results

  • Between the two K-Means Clustering, the two most persistent variables, median income and population, does not seem to relate to one another when we look at their clusters. Median income may increase, so does population, but that does not necessarily mean that houses in those categories have higher predicted value.

  • On the contrary, number of bedrooms and median house age seem to relate to one another more. Houses that are newer have more bedrooms and have a pretty good house value.

# Cluster-level means to characterize each scaled k-means solution.
housing_PI %>%
    group_by(kclust_3_scale) %>%
    summarize(across(c(median_income, population, median_house_value), mean))

# BUG FIX: housing_BA carries the cluster column kclust_3_scale_BA (see
# the mutate() that built it); grouping by kclust_3_scale would error.
housing_BA %>%
    group_by(kclust_3_scale_BA) %>%
    summarize(across(c(housing_median_age, total_bedrooms, median_house_value), mean))

Decision Trees

# Make sure you understand what each line of code is doing
set.seed(123) # don't change this

data_fold <- vfold_cv(housing, v = 10)

ct_spec_tune <- decision_tree() %>%
  set_engine(engine = 'rpart') %>%
  set_args(cost_complexity = tune(),  
           min_n = 2, 
           tree_depth = NULL) %>% 
  set_mode('classification') 

data_rec <- recipe(ocean_proximity ~ ., data = housing)

data_wf_tune <- workflow() %>%
  add_model(ct_spec_tune) %>%
  add_recipe(data_rec)

param_grid <- grid_regular(cost_complexity(range = c(-5, 1)), levels = 10) 

tune_res <- tune_grid(
  data_wf_tune, 
  resamples = data_fold, 
  grid = param_grid, 
  metrics = metric_set(accuracy) #change this for regression trees
)
best_complexity <- select_by_one_std_err(tune_res, metric = 'accuracy', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf_tune, best_complexity)

housing_final_fit <- fit(data_wf_final, data = housing)


tune_res %>% 
  collect_metrics() %>%
  filter(cost_complexity == best_complexity$cost_complexity)
## # A tibble: 1 × 7
##   cost_complexity .metric  .estimator  mean     n  std_err .config              
##             <dbl> <chr>    <chr>      <dbl> <int>    <dbl> <chr>                
## 1       0.0000464 accuracy multiclass 0.978    10 0.000875 Preprocessor1_Model02
housing_final_fit %>% extract_fit_engine() %>% rpart.plot()